The data are accessible from 2017 to today; we used four entire years of data (2017, 2018, 2019, 2020) to match AIS records.
Code
# ------------------------------------------------------------------
# Sentinel-1 SAR vessel detections (2017-2020): load, filter and grid
# ------------------------------------------------------------------
# The raw CSVs were combined once and cached as an .Rdata file; the
# original ingestion code is kept below for provenance.
#path <- "Data/SAR Vessel detections 2017-2020"
#SAR_csv_files <- list.files(path = here::here(path), pattern = "*.csv", full.names = TRUE)
#SAR_fishing <- SAR_csv_files %>%
#  map_df(~read_csv(.))
load(here::here("Data", "SAR_fishing.Rdata"))

# Keep likely fishing vessels (fishing_score >= 0.9) and sum detection
# presence scores on a 0.01-degree grid.
aggregated_SAR_fishing <- SAR_fishing %>%
  mutate(
    lat_rounded = round(lat, digits = 2),
    lon_rounded = round(lon, digits = 2)
  ) %>%
  group_by(lat_rounded, lon_rounded) %>%
  filter(fishing_score >= 0.9) %>%
  summarise(
    total_presence_score = sum(presence_score, na.rm = TRUE),
    avg_fishing_score = mean(fishing_score, na.rm = TRUE),
    count = n()
  ) %>%
  mutate(total_presence_score = round(total_presence_score, digits = 0)) %>%
  ungroup()

# Snap the 0.01-degree cells onto the common 0.1-degree grid shared with
# the AIS data (standardize_coords is defined in the AIS chunk above).
SAR_data_std <- aggregated_SAR_fishing %>%
  mutate(coords = map2(lon_rounded, lat_rounded, standardize_coords)) %>%
  mutate(
    lon_std = map_dbl(coords, ~ .x$lon_std),
    lat_std = map_dbl(coords, ~ .x$lat_std)
  ) %>%
  group_by(lon_std, lat_std) %>%
  summarise(total_presence_score = sum(total_presence_score, na.rm = TRUE),
            .groups = "drop")

# Basemap polygons for plotting
world_map <- map_data("world")

# Global map of SAR fishing-vessel detections (log1p colour scale)
ggplot() +
  geom_map(data = world_map, map = world_map,
           aes(long = long, lat = lat, map_id = region),
           color = "black", fill = "lightgray", size = 0.1) +
  geom_tile(data = SAR_data_std,
            aes(x = lon_std, y = lat_std, fill = total_presence_score)) +
  scale_fill_viridis(
    option = "inferno",
    direction = -1,
    trans = "log1p",
    name = "Fishing vessel detections (2017-2020)",
    breaks = c(0, 1, 10, 100, 1000, 10000),
    labels = scales::comma,
    guide = guide_colorbar(barwidth = 20, barheight = 0.5,
                           title.position = "top", title.hjust = 0.5)
  ) +
  coord_fixed(1.3) +
  theme_minimal() +
  labs(title = NULL, x = "Longitude", y = "Latitude") +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "vertical",
    legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
    legend.title = element_text(margin = ggplot2::margin(b = 10))
  )
Compare AIS and SAR detection locations
Identifying grid cells with only AIS detections, only SAR detections, or both data types.
Code
# ------------------------------------------------------------------
# Compare AIS and SAR coverage cell by cell on the 0.1-degree grid
# ------------------------------------------------------------------

# Full join keeps every grid cell that appears in either data set; cells
# absent from one source get NA for that source's column.
combined_data <- full_join(
  AIS_data_std, SAR_data_std,
  by = c("lon_std", "lat_std")
)

# Classify each cell by which data source(s) detected fishing there.
combined_data <- combined_data %>%
  mutate(category = case_when(
    total_fishing_hours > 0 & total_presence_score > 0 ~ "Both AIS and SAR",
    total_fishing_hours > 0 &
      (is.na(total_presence_score) | total_presence_score == 0) ~ "Only AIS",
    (is.na(total_fishing_hours) | total_fishing_hours == 0) &
      total_presence_score > 0 ~ "Only SAR",
    TRUE ~ "No fishing detected"
  ))

# Basemap polygons
world_map <- map_data("world")

# Categorical world map of data-source coverage
world_plot <- ggplot() +
  geom_map(data = world_map, map = world_map,
           aes(long = long, lat = lat, map_id = region),
           color = "black", fill = "lightgray", size = 0.1) +
  geom_tile(data = combined_data,
            aes(x = lon_std, y = lat_std, fill = category)) +
  scale_fill_manual(
    values = c("Both AIS and SAR" = "purple",
               "Only AIS" = "blue",
               "Only SAR" = "red",
               "No fishing detected" = "white"),
    name = "Fishing Data Source",
    guide = guide_legend(title.position = "top", title.hjust = 0.5)
  ) +
  coord_fixed(1.3) +
  theme_minimal() +
  labs(title = "Global Fishing Detection",
       subtitle = "Comparison of AIS (2017-2020) and SAR (2017-2020) data at 0.1-degree resolution",
       x = "Longitude", y = "Latitude") +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "vertical",
    legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
    legend.title = element_text(margin = ggplot2::margin(b = 10))
  )

# Display the plot
print(world_plot)
Code
# Count grid cells per category and express each count as a percentage
# of all cells.
summary_stats <- combined_data %>%
  count(category) %>%
  mutate(percentage = n / sum(n) * 100) %>%
  rename(`Number of cells` = n) %>%
  mutate(percentage = round(percentage, 2))

# Render the summary as a styled HTML table.
kable(summary_stats, format = "html",
      col.names = c("Category", "Number of cells", "Percentage (%)"),
      caption = "Summary Statistics of Data Categories") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE, position = "center") %>%
  column_spec(2, width = "150px") %>%
  column_spec(3, width = "150px")
Summary Statistics of Data Categories
Category
Number of cells
Percentage (%)
Both AIS and SAR
163095
9.12
Only AIS
1566190
87.60
Only SAR
58668
3.28
Random forest model
Predicting fishing hours in areas with only SAR data at a 0.1 degree resolution. We use a random forest model trained on >160,000 global associations between SAR vessel detections and AIS fishing hours, with geographical coordinates, distance to ports, distance to shore, and bathymetry as predictors.
Code
# ------------------------------------------------------------------
# Environmental predictors: load 0.1-degree rasters, join to the grid,
# and assemble the random-forest training data
# ------------------------------------------------------------------

# Pre-adjusted rasters (already resampled to 0.1 degree)
Ports_adjusted <- raster(here::here("Data", "Environmental data layers", "distance-from-port-0.1deg-adjusted.tif"))
Shore_adjusted <- raster(here::here("Data", "Environmental data layers", "distance-from-shore-0.1deg-adjusted.tif"))
Bathy_adjusted <- raster(here::here("Data", "Environmental data layers", "bathymetry-0.1deg-adjusted.tif"))

# Stack order fixes the column order after conversion below.
raster_stack <- stack(Shore_adjusted, Ports_adjusted, Bathy_adjusted)

# One row per raster cell, with cell-centre coordinates.
raster_df <- as.data.frame(raster_stack, xy = TRUE)
names(raster_df) <- c("x", "y", "dist_shore", "dist_ports", "bathy")
raster_df <- na.omit(raster_df)

# data.table for a fast keyed join; round coordinates to the shared
# 0.1-degree key.
setDT(raster_df)
raster_df[, `:=`(
  lon_std = round(x, digits = 1),
  lat_std = round(y, digits = 1)
)]

# NOTE(review): the file name uses a capital letter O ("O1deg") while the
# loaded object is "combined_data_01deg", and the folder is lower-case
# "data" (elsewhere "Data") -- presumably the actual on-disk names, but
# worth confirming on a case-sensitive file system.
load(here::here("data","combined_data_O1deg.Rdata"))
setDT(combined_data_01deg)

# Inner join (nomatch = 0 drops grid cells without raster coverage).
combined_data_with_rasters <- raster_df[combined_data_01deg,
                                        on = .(lon_std, lat_std), nomatch = 0]
combined_data_with_rasters <- as.data.frame(combined_data_with_rasters)

# Training set: cells observed by both AIS and SAR, complete cases only.
training_data <- combined_data_with_rasters %>%
  filter(has_AIS & has_SAR) %>%
  dplyr::select(total_fishing_hours, total_presence_score, lon_std, lat_std,
                dist_shore, dist_ports, bathy) %>%
  na.omit()

set.seed(123) # for reproducibility

# The model was trained once and cached; the original call is kept below
# for provenance.
#rf_timing <- system.time({
#  rf_model_env <- randomForest(
#    total_fishing_hours ~ total_presence_score + lon_std + lat_std + dist_shore + dist_ports + bathy,
#    data = training_data,
#    ntree = 500,
#    importance = TRUE
#  )
#})
#cat("Random Forest model training time:", rf_timing["elapsed"], "seconds\n")
#save(rf_model_env, file = "rf_model_01deg_with_rasters.Rdata")
load(here::here("rf_model_01deg_with_rasters.Rdata"))
Maps of predictions
Code
# ------------------------------------------------------------------
# Predict fishing hours for SAR-only cells and map the results
# ------------------------------------------------------------------

# Predictor columns in the same layout used for model training.
prediction_data <- combined_data_with_rasters %>%
  dplyr::select(total_presence_score, lon_std, lat_std,
                dist_shore, dist_ports, bathy)

predictions <- predict(rf_model_env, newdata = prediction_data)

# Cell value: observed AIS hours where available, model prediction for
# SAR-only cells (looked up by matching coordinate pairs), otherwise 0.
combined_data_01deg <- combined_data_01deg %>%
  mutate(
    predicted_fishing_hours = case_when(
      has_AIS ~ total_fishing_hours,
      has_SAR ~ predictions[match(paste(lon_std, lat_std),
                                  paste(prediction_data$lon_std,
                                        prediction_data$lat_std))],
      TRUE ~ 0
    )
  )

# Basemap polygons
world_map <- map_data("world")

# SAR-only cells to map.
# NOTE(review): this variable shadows maps::map_data(); later chunks
# reuse the world_map object so nothing breaks, but a different name
# would be clearer.
map_data <- combined_data_01deg %>%
  filter(!has_AIS & has_SAR) %>%
  dplyr::select(lon_std, lat_std, predicted_fishing_hours)

#' Draw a fishing-hours tile map for a given lon/lat window.
#'
#' @param data Data frame with a predicted_fishing_hours column.
#' @param world_map Basemap polygons from map_data("world").
#' @param lon_col,lat_col Names (strings) of the coordinate columns.
#' @param lon_range,lat_range Numeric length-2 plot limits.
#' @param title,subtitle Plot labels.
#' @return A ggplot object.
create_region_map <- function(data, world_map, lon_col, lat_col,
                              lon_range, lat_range, title, subtitle) {
  ggplot() +
    geom_map(data = world_map, map = world_map,
             aes(long, lat, map_id = region),
             color = "darkgray", fill = "lightgray", size = 0.1) +
    geom_tile(data = data,
              aes(x = .data[[lon_col]], y = .data[[lat_col]],
                  fill = predicted_fishing_hours)) +
    scale_fill_viridis(
      option = "inferno",
      direction = -1,
      trans = "log1p",
      name = "Predicted fishing hours (2017-2020)",
      breaks = c(0, 1, 10, 100, 1000, 10000, 100000, 1000000),
      labels = scales::comma,
      guide = guide_colorbar(barwidth = 20, barheight = 0.5,
                             title.position = "top", title.hjust = 0.5)
    ) +
    coord_fixed(1.3, xlim = lon_range, ylim = lat_range) +
    theme_minimal() +
    labs(title = title, subtitle = subtitle,
         x = "Longitude", y = "Latitude") +
    theme(
      legend.position = "bottom",
      legend.direction = "horizontal",
      legend.box = "vertical",
      legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
      legend.title = element_text(margin = ggplot2::margin(b = 10))
    )
}

# Global and regional views of the SAR-only predictions
predicted_SAR_only_plot <- create_region_map(
  map_data, world_map, "lon_std", "lat_std",
  c(-180, 180), c(-90, 90),
  "Predicted Fishing Hours in Areas with Only SAR Detections",
  "0.1 degree resolution")

caribbean_map <- create_region_map(
  map_data, world_map, "lon_std", "lat_std",
  c(-100, -50), c(0, 40),
  "Predicted Fishing Hours in the Caribbean",
  "0.1 degree resolution")

indian_european_map <- create_region_map(
  map_data, world_map, "lon_std", "lat_std",
  c(-20, 80), c(0, 70),
  "Predicted Fishing Hours from Northern Indian Ocean \nto Eastern Atlantic",
  "0.1 degree resolution")

asia_map <- create_region_map(
  map_data, world_map, "lon_std", "lat_std",
  c(60, 180), c(-20, 60),
  "Predicted Fishing Hours in Asia",
  "0.1 degree resolution")

# Display the global map
print(predicted_SAR_only_plot)
Code
# Display the Caribbean regional map created in the previous chunk.
print(caribbean_map)
Code
# Display the Northern Indian Ocean / Eastern Atlantic regional map.
print(indian_european_map)
Code
# Display the Asia regional map.
print(asia_map)
Code
# Map combining observed AIS fishing hours with the random-forest
# predictions for SAR-only cells (predicted_fishing_hours column).
predicted_plot <- ggplot() +
  geom_map(data = world_map, map = world_map,
           aes(long, lat, map_id = region),
           color = "black", fill = "lightgray", size = 0.1) +
  geom_tile(data = combined_data_01deg,
            aes(x = lon_std, y = lat_std, fill = predicted_fishing_hours)) +
  scale_fill_viridis(
    option = "inferno",
    direction = -1,
    trans = "log1p",
    name = "AIS Fishing Effort (hours; 2017-2020)",
    breaks = c(0, 1, 10, 100, 1000, 10000, 100000, 1000000),
    labels = scales::comma,
    guide = guide_colorbar(barwidth = 20, barheight = 0.5,
                           title.position = "top", title.hjust = 0.5)
  ) +
  coord_fixed(1.3) +
  theme_minimal() +
  labs(title = "Global Fishing Hours (0.1 degree resolution)",
       subtitle = "Based on AIS data and Random Forest predictions from SAR data",
       x = "Longitude", y = "Latitude") +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "vertical",
    legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
    legend.title = element_text(margin = ggplot2::margin(b = 10))
  )

print(predicted_plot)
Code
# Aggregate the combined observed + predicted hours to a 0.5-degree
# grid: snap coordinates to the nearest half degree, then sum per cell.
combined_data_05deg <- combined_data_01deg %>%
  mutate(
    lon_05deg = round(lon_std * 2) / 2,
    lat_05deg = round(lat_std * 2) / 2
  )

aggregated_data <- combined_data_05deg %>%
  group_by(lon_05deg, lat_05deg) %>%
  summarise(
    predicted_fishing_hours = sum(predicted_fishing_hours, na.rm = TRUE),
    .groups = 'drop'
  )

# Cache the aggregated grid for downstream use.
save(aggregated_data, file = here::here("Predicted_Fishing_Hours_05Deg.Rdata"))

# Global map at 0.5-degree resolution
predicted_plot_05deg <- ggplot() +
  geom_map(data = world_map, map = world_map,
           aes(long, lat, map_id = region),
           color = "black", fill = "lightgray", size = 0.1) +
  geom_tile(data = aggregated_data,
            aes(x = lon_05deg, y = lat_05deg, fill = predicted_fishing_hours)) +
  scale_fill_viridis(
    option = "inferno",
    direction = -1,
    trans = "log1p",
    name = "AIS Fishing Effort (hours; 2017-2020)",
    breaks = c(0, 1, 10, 100, 1000, 10000, 100000, 1000000),
    labels = scales::comma,
    guide = guide_colorbar(barwidth = 20, barheight = 0.5,
                           title.position = "top", title.hjust = 0.5)
  ) +
  coord_fixed(1.3) +
  theme_minimal() +
  labs(title = "Global Fishing Hours (0.5 degree resolution)",
       subtitle = "Based on AIS data and Random Forest predictions from SAR data",
       x = "Longitude", y = "Latitude") +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "vertical",
    legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
    legend.title = element_text(margin = ggplot2::margin(b = 10))
  )

print(predicted_plot_05deg)
Code
# Regional views of the 0.5-degree aggregated fishing hours
caribbean_map <- create_region_map(
  aggregated_data, world_map, "lon_05deg", "lat_05deg",
  c(-100, -50), c(0, 40),
  "Fishing Hours in the Caribbean", "0.5 degree resolution")

indian_european_map <- create_region_map(
  aggregated_data, world_map, "lon_05deg", "lat_05deg",
  c(-20, 80), c(0, 70),
  "Fishing Hours from Northern Indian Ocean \nto Eastern Atlantic",
  "0.5 degree resolution")

asia_map <- create_region_map(
  aggregated_data, world_map, "lon_05deg", "lat_05deg",
  c(60, 180), c(-20, 60),
  "Fishing Hours in Asia", "0.5 degree resolution")

# Display the Caribbean map first
print(caribbean_map)
Code
# Display the Northern Indian Ocean / Eastern Atlantic 0.5-degree map.
print(indian_european_map)
Code
# Display the Asia 0.5-degree map.
print(asia_map)
Model performance
Code
# ------------------------------------------------------------------
# Model performance on cells where both AIS and SAR data are present
# ------------------------------------------------------------------

#' Evaluate a fitted randomForest regression model on a data set.
#'
#' @param model Fitted randomForest model predicting total_fishing_hours.
#' @param data Data frame containing total_fishing_hours and all predictors.
#' @param log_target Set TRUE if the model was trained on log10(y + 1);
#'   predictions are then back-transformed before computing metrics.
#' @return Named list of error metrics, residual summaries, and the
#'   variable-importance matrix from importance().
evaluate_model <- function(model, data, log_target = FALSE) {
  predictions <- predict(model, newdata = data)
  if (log_target) {
    predictions <- 10^predictions - 1
  }
  actual <- data$total_fishing_hours

  # Basic error metrics
  mae <- mean(abs(actual - predictions), na.rm = TRUE)
  rmse <- sqrt(mean((actual - predictions)^2, na.rm = TRUE))
  # Guard MAPE against division by zero: cells with 0 observed hours
  # would yield Inf and silently blow up the mean (na.rm does not drop
  # Inf). No-op when all observed values are non-zero.
  nonzero <- !is.na(actual) & actual != 0
  mape <- mean(abs((actual[nonzero] - predictions[nonzero]) / actual[nonzero]) * 100,
               na.rm = TRUE)
  medae <- median(abs(actual - predictions), na.rm = TRUE)

  # R-squared: taken from the model's out-of-bag estimate (final element
  # of model$rsq, i.e. randomForest's "% Var explained"), NOT recomputed
  # on `data`.
  r_squared <- model$rsq[length(model$rsq)]

  # Feature importance matrix; one row per predictor.
  feature_importance <- importance(model)

  # Adjusted R-squared.
  # BUGFIX: the original read model$forest$independent.variable.names,
  # which is a ranger field and is NULL on randomForest objects, making
  # p = 0. The predictor count is taken from the importance matrix.
  n <- length(actual)
  p <- nrow(feature_importance)
  adj_r_squared <- 1 - ((1 - r_squared) * (n - 1) / (n - p - 1))

  # Residual analysis
  residuals <- actual - predictions
  mean_residual <- mean(residuals, na.rm = TRUE)
  sd_residual <- sd(residuals, na.rm = TRUE)

  list(
    "Mean Absolute Error" = mae,
    "Root Mean Squared Error" = rmse,
    "Mean Absolute Percentage Error" = mape,
    "Median Absolute Error" = medae,
    "R-Squared" = r_squared,
    "Adjusted R-Squared" = adj_r_squared,
    "Mean of Residuals" = mean_residual,
    "Standard Deviation of Residuals" = sd_residual,
    "Feature Importance" = feature_importance
  )
}

# Label each cell by data availability, then keep only cells observed by
# both data sources for validation.
validation_data <- combined_data_with_rasters %>%
  mutate(
    data_category = case_when(
      has_AIS & has_SAR ~ "Both AIS and SAR",
      has_AIS & !has_SAR ~ "Only AIS",
      !has_AIS & has_SAR ~ "Only SAR",
      TRUE ~ "No fishing detected"
    )
  )

validation_data <- validation_data %>%
  filter(data_category == "Both AIS and SAR")

results_rf_env <- evaluate_model(rf_model_env, validation_data)

# Assemble the scalar metrics into a display table.
results_table <- data.frame(
  Metric = c("Mean Absolute Error", "Root Mean Squared Error",
             "Mean Absolute Percentage Error", "Median Absolute Error",
             "R-Squared", "Adjusted R-Squared",
             "Mean of Residuals", "Standard Deviation of Residuals"),
  Value = round(c(results_rf_env$`Mean Absolute Error`,
                  results_rf_env$`Root Mean Squared Error`,
                  results_rf_env$`Mean Absolute Percentage Error`,
                  results_rf_env$`Median Absolute Error`,
                  results_rf_env$`R-Squared`,
                  results_rf_env$`Adjusted R-Squared`,
                  results_rf_env$`Mean of Residuals`,
                  results_rf_env$`Standard Deviation of Residuals`), 2)
)

# Render the metrics table.
kable(results_table, format = "html", digits = 4,
      caption = "Model Evaluation Metrics") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE, position = "center")
Model Evaluation Metrics
Metric
Value
Mean Absolute Error
174.91
Root Mean Squared Error
810.11
Mean Absolute Percentage Error
1250.65
Median Absolute Error
22.69
R-Squared
0.81
Adjusted R-Squared
0.81
Mean of Residuals
-3.60
Standard Deviation of Residuals
810.11
Code
# Variable-importance table for the fitted random forest, sorted so the
# most influential predictors come first.
feature_importance <- as.data.frame(results_rf_env$`Feature Importance`)
feature_importance$Feature <- rownames(feature_importance)
feature_importance <- feature_importance[, c("Feature", "%IncMSE", "IncNodePurity")]
colnames(feature_importance) <- c("Feature", "%IncMSE", "IncNodePurity")

# Descending by %IncMSE (permutation importance)
feature_importance <- feature_importance[order(-feature_importance$`%IncMSE`), ]

kable(feature_importance, format = "html", digits = 4,
      col.names = c("Feature", "%IncMSE", "IncNodePurity"),
      caption = "Feature Importance") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE, position = "center") %>%
  column_spec(1, bold = TRUE) %>%
  column_spec(2:3, width = "150px")
Feature Importance
Feature
%IncMSE
IncNodePurity
total_presence_score
total_presence_score
127.1595
737080845352
lon_std
lon_std
80.5654
557956385485
dist_ports
dist_ports
55.9307
312138319465
bathy
bathy
48.4341
238192589722
lat_std
lat_std
48.0964
540028288659
dist_shore
dist_shore
46.3306
213555877780
Test for spatial auto-correlation
Code
# ------------------------------------------------------------------
# Spatial autocorrelation of model residuals (Moran's I on a test set)
# ------------------------------------------------------------------

# Cells observed by both data sources, complete cases only.
data <- combined_data_with_rasters %>%
  filter(has_AIS & has_SAR) %>%
  dplyr::select(total_fishing_hours, total_presence_score, lon_std, lat_std,
                dist_shore, dist_ports, bathy) %>%
  na.omit()

# Random 70/30 train/test split.
set.seed(123) # for reproducibility
train_indices <- sample(1:nrow(data), 0.7 * nrow(data))
train_data <- data[train_indices, ]
test_data <- data[-train_indices, ]

# The train-only model was fitted once and cached; the original call is
# kept below for provenance.
#set.seed(123) # for reproducibility
#rf_timing <- system.time({
#  rf_model_env <- randomForest(
#    total_fishing_hours ~ total_presence_score + lon_std + lat_std + dist_shore + dist_ports + bathy,
#    data = train_data,
#    ntree = 500,
#    importance = TRUE
#  )
#})
#cat("Random Forest model training time:", rf_timing["elapsed"], "seconds\n")
#rf_model_env_train=rf_model_env
#save(rf_model_env_train, file = "rf_model_01deg_with_rasters_train.Rdata")
load(here::here("rf_model_01deg_with_rasters_train.Rdata"))

# Held-out predictions and residuals
test_data$predictions <- predict(rf_model_env_train, test_data)
test_data$residuals <- test_data$total_fishing_hours - test_data$predictions

# Points as sf, WGS84
test_data_sf <- st_as_sf(test_data, coords = c("lon_std", "lat_std"), crs = 4326)

# k-nearest-neighbour spatial weights (row-standardised)
k <- 8 # You can adjust this number
nb <- knearneigh(st_coordinates(test_data_sf), k = k)
nb <- knn2nb(nb)
lw <- nb2listw(nb, style = "W")

# Moran's I on the test-set residuals
moran_test <- moran.test(test_data_sf$residuals, lw)

# Tidy the test output for display
moran_results <- data.frame(
  Statistic = c("Moran I statistic", "Expectation", "Variance",
                "Standard deviate", "P-value"),
  Value = c(round(moran_test$estimate[1], 2),
            round(moran_test$estimate[2], 2),
            round(moran_test$estimate[3], 2),
            round(moran_test$statistic, 2),
            ifelse(moran_test$p.value < 2.2e-16, "< 2.2e-16",
                   format(moran_test$p.value, scientific = TRUE, digits = 4)))
)

kable(moran_results, format = "html",
      col.names = c("Statistic", "Value"),
      caption = "Moran I Test under randomisation") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE, position = "center") %>%
  column_spec(1, bold = TRUE) %>%
  column_spec(2, width = "15em")
Moran I Test under randomisation
Statistic
Value
Moran I statistic
Moran I statistic
0.25
Expectation
Expectation
0
Variance
Variance
0
Moran I statistic standard deviate
Standard deviate
115.35
P-value
< 2.2e-16
Spatial cross-validation
Code
# ------------------------------------------------------------------
# Build spatial blocks for 6-fold cross-validation via k-means
# ------------------------------------------------------------------

# Promote the training coordinates to sp objects.
coords <- SpatialPoints(training_data[, c("lon_std", "lat_std")])
spatial_data <- SpatialPointsDataFrame(coords, data = training_data)

# Work with an sf object from here on.
if (!inherits(spatial_data, "sf")) {
  spatial_data_sf <- st_as_sf(spatial_data)
} else {
  spatial_data_sf <- spatial_data
}

coords <- st_coordinates(spatial_data_sf)

# Six spatial clusters = six CV folds.
set.seed(123) # for reproducibility
kmeans_result <- kmeans(coords, centers = 6)
spatial_data_sf$block <- as.factor(kmeans_result$cluster)

# Visualise the block assignment.
ggplot() +
  geom_sf(data = spatial_data_sf, aes(color = block), size = 1) +
  scale_color_viridis_d(name = "Block") +
  theme_minimal() +
  labs(title = "Spatial Blocks for 6-Fold Cross-Validation",
       subtitle = "Points colored by block assignment") +
  theme(legend.position = "bottom")
Code
# ------------------------------------------------------------------
# Spatial 6-fold cross-validation: the CV run and per-fold Moran's I
# were computed once (commented code below, kept for provenance) and
# cached; this chunk reloads the cached results and tabulates them.
# ------------------------------------------------------------------
# Register cores for parallel processing (use 1 less than your total cores)
#cl <- makeCluster(detectCores() - 1)
#registerDoParallel(cl)
# Initialize a list to store results
#results <- foreach(i = 1:6, .packages = c("randomForest", "dplyr"), .export = "spatial_data_sf") %dopar% {
# Get the indices of the training and test sets for the i-th block
# test_indices <- spatial_data_sf$block == i
# train_data <- spatial_data_sf[!test_indices, ]
# test_data <- spatial_data_sf[test_indices, ]
# Train Random Forest on the training data
# rf_model <- randomForest(
# total_fishing_hours ~ total_presence_score + lon_std + lat_std + dist_shore + dist_ports + bathy,
# data = train_data,
# ntree = 500,
# importance = TRUE
# )
# Save the model
# model_filename <- paste0("rf_model_fold_", i, ".rds")
# saveRDS(rf_model, file = model_filename)
# Predict on the test data
# predictions <- predict(rf_model, test_data)
# Return a list with results and model filename
# list(
# results = data.frame(observed = test_data$total_fishing_hours, predicted = predictions, fold = i),
# model_filename = model_filename
# )
#}
# Stop the cluster once done
#stopCluster(cl)
# Extract results and model filenames
#results_df <- do.call(rbind, lapply(results, function(x) x$results))
#model_filenames <- sapply(results, function(x) x$model_filename)
# Save results
#save(results_df, file = "spatial_cv_results.RData")
#save(model_filenames, file = "model_filenames.RData")
#load(here::here("spatial_cv_results.RData"))
#load(here::here("model_filenames.RData"))
# Ensure spatial_data and results_df are aligned
#spatial_data <- spatial_data[order(row.names(spatial_data)), ]
#results_df <- results_df[order(row.names(results_df)), ]
# Get all coordinates
#all_coordinates <- coordinates(spatial_data)
# Create neighbor list for the entire dataset
#k <- 30 # Number of nearest neighbors, adjust as needed
#nb_all <- knn2nb(knearneigh(all_coordinates, k = k))
# Create spatial weights for the entire dataset
#lw_all <- nb2listw(nb_all, style="W", zero.policy = TRUE)
# Function to calculate Moran's I
#calculate_morans_i <- function(residuals, indices, lw_all) {
# Create a logical vector for subsetting
# subset_vector <- rep(FALSE, length(lw_all$neighbours))
# subset_vector[indices] <- TRUE
# Subset the weights list for the current fold
# lw_subset <- subset.listw(lw_all, subset_vector, zero.policy = TRUE)
# Calculate Moran's I
# moran_result <- moran.test(residuals, lw_subset, zero.policy = TRUE)
# return(moran_result)
#}
# Calculate Moran's I for each fold in the spatial cross-validation
#morans_i_results <- list()
#for (i in 1:6) {
# Get the indices for this fold
# fold_indices <- which(results_df$fold == i)
# Calculate residuals
# fold_residuals <- results_df$observed[fold_indices] - results_df$predicted[fold_indices]
# Calculate Moran's I for this fold
# morans_i_results[[i]] <- calculate_morans_i(fold_residuals, fold_indices, lw_all)
# Print progress
# print(paste("Completed fold", i))
#}
#save(morans_i_results, file="morans_i_results.Rdata")
# Reload the cached per-fold Moran's I test objects.
load(here::here("morans_i_results.Rdata"))
# Print results
#print("Moran's I for each fold in spatial cross-validation:")
#for (i in 1:5) {
# print(paste("Fold", i))
# print(morans_i_results[[i]])
#}
# Extract relevant information from Moran's I results (one row per fold)
morans_table <- data.frame(
  Fold = 1:6,
  Moran_I = sapply(morans_i_results, function(x) x$estimate[1]),
  Expectation = sapply(morans_i_results, function(x) x$estimate[2]),
  Variance = sapply(morans_i_results, function(x) x$estimate[3]),
  Std_Deviate = sapply(morans_i_results, function(x) x$statistic),
  P_Value = sapply(morans_i_results, function(x) x$p.value)
)
# Format the table: rounding, scientific notation, and a "< 2.2e-16"
# floor for vanishingly small p-values
morans_table_formatted <- morans_table %>%
  mutate(
    Moran_I = round(Moran_I, 4),
    Expectation = format(Expectation, scientific = TRUE, digits = 4),
    Variance = format(Variance, scientific = TRUE, digits = 4),
    Std_Deviate = round(Std_Deviate, 2),
    P_Value = ifelse(P_Value < 2.2e-16, "< 2.2e-16",
                     format(P_Value, scientific = TRUE, digits = 4))
  )
# Create and print the table using kable and kable_styling
kable(morans_table_formatted,
      col.names = c("Fold", "Moran's I", "Expectation", "Variance", "Std. Deviate", "p-value"),
      caption = "Moran's I Results for Each Fold in Spatial Cross-Validation",
      align = c('c', 'r', 'r', 'r', 'r', 'r'),
      format = "html") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE, position = "center") %>%
  column_spec(1, bold = TRUE) %>%
  column_spec(2:6, width = "150px")
Moran's I Results for Each Fold in Spatial Cross-Validation
Fold
Moran's I
Expectation
Variance
Std. Deviate
p-value
1
0.4423
-7.330e-05
4.410e-06
210.64
< 2.2e-16
2
0.4504
-1.275e-05
8.039e-07
502.39
< 2.2e-16
3
0.3825
-7.548e-05
4.594e-06
178.51
< 2.2e-16
4
0.6267
-3.090e-05
1.907e-06
453.87
< 2.2e-16
5
0.3767
-9.321e-05
5.730e-06
157.42
< 2.2e-16
6
0.4118
-6.942e-05
4.234e-06
200.15
< 2.2e-16
Spatially blocked predictions
Code
# ------------------------------------------------------------------
# Ensemble predictions from the six spatially blocked fold models
# ------------------------------------------------------------------
library(raster)
library(ggplot2)
library(viridis)
library(dplyr)
library(data.table)

# Filenames of the six per-fold models saved during cross-validation.
load(here::here("model_filenames.RData"))

#' Average the predictions of the six fold models over `data`.
#' Each model is read from disk on demand rather than held in memory.
make_predictions <- function(data) {
  predictions <- matrix(0, nrow = nrow(data), ncol = 6)
  for (i in 1:6) {
    model <- readRDS(here::here(model_filenames[i]))
    predictions[, i] <- predict(model, data)
  }
  # Mean across the six folds, one value per row
  rowMeans(predictions)
}

# Predictor columns in the training layout (same as before).
prediction_data <- combined_data_with_rasters %>%
  dplyr::select(total_presence_score, lon_std, lat_std,
                dist_shore, dist_ports, bathy)

new_predictions <- make_predictions(prediction_data)

# Observed AIS hours where available; ensemble prediction for SAR-only
# cells (looked up by matching coordinate pairs); otherwise 0.
combined_data_01deg <- combined_data_01deg %>%
  mutate(
    predicted_fishing_hours = case_when(
      has_AIS ~ total_fishing_hours,
      has_SAR ~ new_predictions[match(paste(lon_std, lat_std),
                                      paste(prediction_data$lon_std,
                                            prediction_data$lat_std))],
      TRUE ~ 0
    )
  )

# SAR-only cells to map, using the same plotting helpers as before.
map_data <- combined_data_01deg %>%
  filter(!has_AIS & has_SAR) %>%
  dplyr::select(lon_std, lat_std, predicted_fishing_hours)

# Global and regional maps of the spatially blocked predictions
predicted_SAR_only_plot <- create_region_map(
  map_data, world_map, "lon_std", "lat_std",
  c(-180, 180), c(-90, 90),
  "Predicted Fishing Hours in Areas with Only SAR Detections",
  "0.1 degree resolution (Spatially Blocked Model)")

caribbean_map <- create_region_map(
  map_data, world_map, "lon_std", "lat_std",
  c(-100, -50), c(0, 40),
  "Predicted Fishing Hours in the Caribbean",
  "0.1 degree resolution (Spatially Blocked Model)")

indian_european_map <- create_region_map(
  map_data, world_map, "lon_std", "lat_std",
  c(-20, 80), c(0, 70),
  "Predicted Fishing Hours from Northern Indian Ocean \nto Eastern Atlantic",
  "0.1 degree resolution (Spatially Blocked Model)")

asia_map <- create_region_map(
  map_data, world_map, "lon_std", "lat_std",
  c(60, 180), c(-20, 60),
  "Predicted Fishing Hours in Asia",
  "0.1 degree resolution (Spatially Blocked Model)")

# Display the global map
print(predicted_SAR_only_plot)
Code
# Display the Caribbean map (spatially blocked model predictions).
print(caribbean_map)
Code
# Display the Northern Indian Ocean / Eastern Atlantic map
# (spatially blocked model predictions).
print(indian_european_map)
Code
# Display the Asia map (spatially blocked model predictions).
print(asia_map)
Code
# Global map combining observed AIS hours with spatially blocked
# ensemble predictions for SAR-only cells.
predicted_plot <- ggplot() +
  geom_map(data = world_map, map = world_map,
           aes(long, lat, map_id = region),
           color = "black", fill = "lightgray", size = 0.1) +
  geom_tile(data = combined_data_01deg,
            aes(x = lon_std, y = lat_std, fill = predicted_fishing_hours)) +
  scale_fill_viridis(
    option = "inferno",
    direction = -1,
    trans = "log1p",
    name = "AIS Fishing Effort (hours; 2017-2020)",
    breaks = c(0, 1, 10, 100, 1000, 10000, 100000, 1000000),
    labels = scales::comma,
    guide = guide_colorbar(barwidth = 20, barheight = 0.5,
                           title.position = "top", title.hjust = 0.5)
  ) +
  coord_fixed(1.3) +
  theme_minimal() +
  labs(title = "Global Fishing Hours (0.1 degree resolution)",
       subtitle = "Based on AIS data and Spatially Blocked Random Forest predictions from SAR data",
       x = "Longitude", y = "Latitude") +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "vertical",
    legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
    legend.title = element_text(margin = ggplot2::margin(b = 10))
  )

print(predicted_plot)
Code
# Aggregate the spatially blocked results to a 0.5-degree grid and
# build global + regional maps.
combined_data_05deg <- combined_data_01deg %>%
  mutate(
    lon_05deg = round(lon_std * 2) / 2,
    lat_05deg = round(lat_std * 2) / 2
  ) %>%
  group_by(lon_05deg, lat_05deg) %>%
  summarize(
    predicted_fishing_hours = sum(predicted_fishing_hours, na.rm = TRUE)
  ) %>%
  ungroup()

# Global map (0.5 degree)
global_map_05deg <- create_region_map(
  combined_data_05deg, world_map, "lon_05deg", "lat_05deg",
  c(-180, 180), c(-90, 90),
  "Global Fishing Hours", "0.5 degree resolution (Observed and Predicted)")

# Caribbean map (0.5 degree)
caribbean_map_05deg <- create_region_map(
  combined_data_05deg, world_map, "lon_05deg", "lat_05deg",
  c(-100, -50), c(0, 40),
  "Fishing Hours in the Caribbean",
  "0.5 degree resolution (Observed and Predicted)")

# Northwestern Indian Ocean to Western European waters map (0.5 degree)
indian_european_map_05deg <- create_region_map(
  combined_data_05deg, world_map, "lon_05deg", "lat_05deg",
  c(-20, 80), c(0, 70),
  "Fishing Hours from Northern Indian Ocean \nto Eastern Atlantic",
  "0.5 degree resolution (Observed and Predicted)")

# Asia map (0.5 degree)
asia_map_05deg <- create_region_map(
  combined_data_05deg, world_map, "lon_05deg", "lat_05deg",
  c(60, 180), c(-20, 60),
  "Fishing Hours in Asia",
  "0.5 degree resolution (Observed and Predicted)")

# Display the global map first
print(global_map_05deg)
Code
# Display the Caribbean 0.5-degree map (observed and predicted).
print(caribbean_map_05deg)
Code
# Display the Northern Indian Ocean / Eastern Atlantic 0.5-degree map
# (observed and predicted).
print(indian_european_map_05deg)
Code
# Display the Asia 0.5-degree map (observed and predicted).
print(asia_map_05deg)
Source Code
---title: "Global_fishing_watch_workflow_envpreds"author: "Théophile L. Mouton"date: "`r Sys.Date()`"format: html: toc: true toc-location: right css: custom.css output-file: "Global_Fishing_Watch_workflow_envpreds.html" self-contained: true code-fold: true code-tools: trueeditor: visualexecute: warning: falseparams: printlabels: TRUE---# Combining AIS and Sentinel-1 SAR fishing effort data from Global Fishing Watch## Open R libraries```{r}# Load required librarieslibrary(tidyverse)library(maps)library(ggplot2)library(data.table)library(gridExtra)library(raster)library(sf)library(viridis)library(scales)library(dplyr)library(randomForest)library(caret)library(pdp)library(knitr)library(kableExtra)library(future)library(spdep)library(ncf)library(blockCV)library(doParallel)library(sp)```## AIS dataData from Kroodsma et al. (2018) Science, accessible at: [Global Fishing Watch Data Download Portal](https://globalfishingwatch.org/data-download/datasets/public-fishing-effort).The data is accessible up to the end of the year 2020, we used four entire years of data (2017, 2018, 2019, 2020) to match SAR data records.```{r}# Set the path to the 2017-2020 folder#path <- "Data/AIS Fishing Effort 2017-2020"# List all CSV files in the folder#AIS_csv_files <- list.files(path = here::here(path), pattern = "*.csv", full.names = TRUE, recursive = TRUE)# Read all CSV files and combine them into a single data frame#AIS_fishing <- AIS_csv_files %>%# map_df(~read_csv(.))load(here::here("Data","AIS_fishing.Rdata"))# Aggregate fishing hours by latitude and longitudeaggregated_AIS_fishing <- AIS_fishing %>%group_by(cell_ll_lat, cell_ll_lon) %>%summarise(total_fishing_hours =sum(fishing_hours, na.rm =TRUE)) %>%ungroup() %>%# Remove any cells with zero or negative fishing hoursfilter(total_fishing_hours >0)# Function to standardize coordinates to 0.1 degree resolutionstandardize_coords <-function(lon, lat) {list(lon_std =floor(lon *10) /10,lat_std =floor(lat *10) /10 )}# Standardize and 
aggregate AIS dataAIS_data_std <- aggregated_AIS_fishing %>%mutate(coords =map2(cell_ll_lon, cell_ll_lat, standardize_coords)) %>%mutate(lon_std =map_dbl(coords, ~ .x$lon_std),lat_std =map_dbl(coords, ~ .x$lat_std) ) %>%group_by(lon_std, lat_std) %>%summarise(total_fishing_hours =sum(total_fishing_hours, na.rm =TRUE), .groups ="drop")# Create the world mapworld_map <-map_data("world")# Create the plotggplot() +geom_map(data = world_map, map = world_map,aes(long = long, lat = lat, map_id = region),color ="black", fill ="lightgray", size =0.1) +geom_tile(data = AIS_data_std, aes(x = lon_std, y = lat_std, fill = total_fishing_hours)) +scale_fill_viridis(option ="inferno",direction =-1,trans ="log1p",name ="AIS Fishing Effort (hours; 2017-2020)", breaks =c(0, 1, 10, 100, 1000, 10000, 100000),labels = scales::comma,guide =guide_colorbar(barwidth =20, barheight =0.5, title.position ="top", title.hjust =0.5) ) +coord_fixed(1.3) +theme_minimal() +labs(title =NULL,x ="Longitude", y ="Latitude") +theme(legend.position ="bottom",legend.direction ="horizontal",legend.box ="vertical",legend.margin = ggplot2::margin(t =20, r =0, b =0, l =0),legend.title =element_text(margin = ggplot2::margin(b =10)) )```## Sentinel-1 SAR dataData from Paolo et al. 
2024, *Nature*, accessible at: [Global Fishing Watch SAR Vessel Detections](https://globalfishingwatch.org/data-download/datasets/public-sar-vessel-detections:v20231026)

The data is accessible from 2017 to today; we used four entire years of data (2017, 2018, 2019, 2020) to match AIS records.

```{r}
# One-off assembly of the raw CSV downloads (2017-2020 folder), kept
# commented for provenance; the combined result is loaded below.
# path <- "Data/SAR Vessel detections 2017-2020"
# SAR_csv_files <- list.files(path = here::here(path), pattern = "*.csv", full.names = TRUE)
# SAR_fishing <- SAR_csv_files %>%
#   map_df(~read_csv(.))

load(here::here("Data", "SAR_fishing.Rdata"))

# Aggregate presence scores by rounded latitude/longitude, keeping only
# detections classified as likely fishing vessels.
# FIX: the row-wise filter on fishing_score now runs BEFORE group_by();
# the result is identical (the condition is per-row), but rows that are
# discarded anyway are no longer grouped first.
aggregated_SAR_fishing <- SAR_fishing %>%
  filter(fishing_score >= 0.9) %>%
  mutate(
    lat_rounded = round(lat, digits = 2),
    lon_rounded = round(lon, digits = 2)
  ) %>%
  group_by(lat_rounded, lon_rounded) %>%
  summarise(
    total_presence_score = sum(presence_score, na.rm = TRUE),
    avg_fishing_score = mean(fishing_score, na.rm = TRUE),
    count = n()
  ) %>%
  mutate(total_presence_score = round(total_presence_score, digits = 0)) %>%
  ungroup()

# Standardize and aggregate SAR data onto the same 0.1-degree grid as AIS
SAR_data_std <- aggregated_SAR_fishing %>%
  mutate(coords = map2(lon_rounded, lat_rounded, standardize_coords)) %>%
  mutate(
    lon_std = map_dbl(coords, ~ .x$lon_std),
    lat_std = map_dbl(coords, ~ .x$lat_std)
  ) %>%
  group_by(lon_std, lat_std) %>%
  summarise(total_presence_score = sum(total_presence_score, na.rm = TRUE), .groups = "drop")

# Create the world map
world_map <- map_data("world")

# Create the plot
ggplot() +
  geom_map(data = world_map, map = world_map,
           aes(long = long, lat = lat, map_id = region),
           color = "black", fill = "lightgray", size = 0.1) +
  geom_tile(data = SAR_data_std, aes(x = lon_std, y = lat_std, fill = total_presence_score)) +
  scale_fill_viridis(
    option = "inferno",
    direction = -1,
    trans = "log1p",
    name = "Fishing vessel detections (2017-2020)",
    breaks = c(0, 1, 10, 100, 1000, 10000),
    labels = scales::comma,
    guide = guide_colorbar(barwidth = 20, barheight
= 0.5, title.position = "top", title.hjust = 0.5)
  ) +
  coord_fixed(1.3) +
  theme_minimal() +
  labs(title = NULL, x = "Longitude", y = "Latitude") +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "vertical",
    legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
    legend.title = element_text(margin = ggplot2::margin(b = 10))
  )
```

## Compare AIS, and SAR detection locations

Identifying grid cells with only AIS, only SAR detections or both data types.

```{r}
# Merge the datasets (full join keeps cells present in either source)
combined_data <- full_join(
  AIS_data_std, SAR_data_std,
  by = c("lon_std", "lat_std")
)

# Categorize each cell; NA from the full join means the source had no
# detection in that cell, so it is treated like zero.
combined_data <- combined_data %>%
  mutate(category = case_when(
    total_fishing_hours > 0 & total_presence_score > 0 ~ "Both AIS and SAR",
    total_fishing_hours > 0 & (is.na(total_presence_score) | total_presence_score == 0) ~ "Only AIS",
    (is.na(total_fishing_hours) | total_fishing_hours == 0) & total_presence_score > 0 ~ "Only SAR",
    TRUE ~ "No fishing detected"
  ))

# Create the world map
world_map <- map_data("world")

# Create the plot
world_plot <- ggplot() +
  geom_map(data = world_map, map = world_map,
           aes(long = long, lat = lat, map_id = region),
           color = "black", fill = "lightgray", size = 0.1) +
  geom_tile(data = combined_data, aes(x = lon_std, y = lat_std, fill = category)) +
  scale_fill_manual(
    values = c("Both AIS and SAR" = "purple", "Only AIS" = "blue",
               "Only SAR" = "red", "No fishing detected" = "white"),
    name = "Fishing Data Source",
    guide = guide_legend(title.position = "top", title.hjust = 0.5)
  ) +
  coord_fixed(1.3) +
  theme_minimal() +
  labs(
    title = "Global Fishing Detection",
    subtitle = "Comparison of AIS (2017-2020) and SAR (2017-2020) data at 0.1-degree resolution",
    x = "Longitude", y = "Latitude"
  ) +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "vertical",
    legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
    legend.title = element_text(margin = ggplot2::margin(b = 10))
  )

# Display the plot
print(world_plot)

# Get summary statistics
summary_stats <- combined_data %>%
  count(category) %>%
  mutate(percentage = n
/ sum(n) * 100) %>%
  rename(`Number of cells` = n) %>%
  mutate(percentage = round(percentage, 2))

# Create and display the table
kable(summary_stats, format = "html",
      col.names = c("Category", "Number of cells", "Percentage (%)"),
      caption = "Summary Statistics of Data Categories") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE, position = "center") %>%
  column_spec(2, width = "150px") %>%
  column_spec(3, width = "150px")
```

## Random forest model

Predicting fishing hours in areas with only SAR data at a 0.1 degree resolution. Using a random forest model of \>160,000 associations between SAR vessel detections and AIS fishing hours globally, geographical coordinates, distance to ports, distance to shore and bathymetry.

```{r}
# Open the saved adjusted rasters (pre-resampled to 0.1 degree)
Ports_adjusted <- raster(here::here("Data", "Environmental data layers", "distance-from-port-0.1deg-adjusted.tif"))
Shore_adjusted <- raster(here::here("Data", "Environmental data layers", "distance-from-shore-0.1deg-adjusted.tif"))
Bathy_adjusted <- raster(here::here("Data", "Environmental data layers", "bathymetry-0.1deg-adjusted.tif"))

# Stack the resampled rasters
raster_stack <- stack(Shore_adjusted, Ports_adjusted, Bathy_adjusted)

# Convert the stack to a dataframe (xy = TRUE keeps cell coordinates)
raster_df <- as.data.frame(raster_stack, xy = TRUE)

# Rename the columns (order matches the stack order above)
names(raster_df) <- c("x", "y", "dist_shore", "dist_ports", "bathy")

# Remove NA values if desired
raster_df <- na.omit(raster_df)

# Convert to data.table for efficiency
setDT(raster_df)

# Round x and y to 1 decimal place for consistency with the 0.1-degree grid
raster_df[, `:=`(
  lon_std = round(x, digits = 1),
  lat_std = round(y, digits = 1)
)]

# Now proceed with the join and model training
load(here::here("data", "combined_data_O1deg.Rdata"))
setDT(combined_data_01deg)

# Perform the join using data.table (inner join: nomatch = 0)
combined_data_with_rasters <- raster_df[combined_data_01deg, on = .(lon_std, lat_std), nomatch = 0]

# Convert back to dataframe if needed
combined_data_with_rasters <- as.data.frame(combined_data_with_rasters)

# Prepare the training data: cells observed by BOTH sources
training_data <- combined_data_with_rasters %>%
  filter(has_AIS & has_SAR) %>%
  dplyr::select(total_fishing_hours, total_presence_score, lon_std, lat_std, dist_shore, dist_ports, bathy) %>%
  na.omit()

# Train the random forest model with timing (training is expensive, so the
# call is kept commented and the fitted model is loaded from disk below)
set.seed(123) # for reproducibility
# rf_timing <- system.time({
#   rf_model_env <- randomForest(
#     total_fishing_hours ~ total_presence_score + lon_std + lat_std + dist_shore + dist_ports + bathy,
#     data = training_data,
#     ntree = 500,
#     importance = TRUE
#   )
# })
# cat("Random Forest model training time:", rf_timing["elapsed"], "seconds\n")

# Save the new model
# save(rf_model_env, file = "rf_model_01deg_with_rasters.Rdata")
load(here::here("rf_model_01deg_with_rasters.Rdata"))
```

### Maps of predictions

```{r}
# Prepare the prediction data
prediction_data <- combined_data_with_rasters %>%
  dplyr::select(total_presence_score, lon_std, lat_std, dist_shore, dist_ports, bathy)

# Make predictions
predictions <- predict(rf_model_env, newdata = prediction_data)

# Add predictions to the original dataset: observed AIS hours where
# available, RF predictions for SAR-only cells (matched on coordinates),
# zero otherwise.
combined_data_01deg <- combined_data_01deg %>%
  mutate(
    predicted_fishing_hours = case_when(
      has_AIS ~ total_fishing_hours,
      has_SAR ~ predictions[match(paste(lon_std, lat_std), paste(prediction_data$lon_std, prediction_data$lat_std))],
      TRUE ~ 0
    )
  )

# Create the world map
world_map <- map_data("world")

# Map of predicted fishing hours only
# Prepare the data for the map
# NOTE(review): this reuses the name `map_data`, masking ggplot2::map_data()
# for the rest of the session — consider renaming to avoid surprises.
map_data <- combined_data_01deg %>%
  filter(!has_AIS & has_SAR) %>%
  dplyr::select(lon_std, lat_std, predicted_fishing_hours)

# Function to create a predicted-fishing-hours map for a specific region;
# lon_col/lat_col are column names (strings), lon_range/lat_range the extent.
create_region_map <- function(data, world_map, lon_col, lat_col, lon_range, lat_range, title, subtitle) {
  ggplot() +
    geom_map(data = world_map, map = world_map,
             aes(long, lat, map_id = region),
             color = "darkgray", fill = "lightgray", size = 0.1) +
    geom_tile(data = data, aes(x = .data[[lon_col]], y = .data[[lat_col]], fill = predicted_fishing_hours)) +
    scale_fill_viridis(option
= "inferno",
                       direction = -1,
                       trans = "log1p",
                       name = "Predicted fishing hours (2017-2020)",
                       breaks = c(0, 1, 10, 100, 1000, 10000, 100000, 1000000),
                       labels = scales::comma,
                       guide = guide_colorbar(barwidth = 20, barheight = 0.5, title.position = "top", title.hjust = 0.5)) +
    coord_fixed(1.3, xlim = lon_range, ylim = lat_range) +
    theme_minimal() +
    labs(title = title, subtitle = subtitle, x = "Longitude", y = "Latitude") +
    theme(
      legend.position = "bottom",
      legend.direction = "horizontal",
      legend.box = "vertical",
      legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
      legend.title = element_text(margin = ggplot2::margin(b = 10))
    )
}

# Global map
predicted_SAR_only_plot <- create_region_map(map_data, world_map, "lon_std", "lat_std",
                                             c(-180, 180), c(-90, 90),
                                             "Predicted Fishing Hours in Areas with Only SAR Detections",
                                             "0.1 degree resolution")

# Caribbean map
caribbean_map <- create_region_map(map_data, world_map, "lon_std", "lat_std",
                                   c(-100, -50), c(0, 40),
                                   "Predicted Fishing Hours in the Caribbean",
                                   "0.1 degree resolution")

# Northwestern Indian Ocean to Western European waters map
indian_european_map <- create_region_map(map_data, world_map, "lon_std", "lat_std",
                                         c(-20, 80), c(0, 70),
                                         "Predicted Fishing Hours from Northern Indian Ocean \nto Eastern Atlantic",
                                         "0.1 degree resolution")

# Asia map
asia_map <- create_region_map(map_data, world_map, "lon_std", "lat_std",
                              c(60, 180), c(-20, 60),
                              "Predicted Fishing Hours in Asia",
                              "0.1 degree resolution")

# Print the maps
print(predicted_SAR_only_plot)
print(caribbean_map)
print(indian_european_map)
print(asia_map)

# Map of both original and predicted AIS fishing hours
# Visualize the results
predicted_plot <- ggplot() +
  geom_map(data = world_map, map = world_map,
           aes(long, lat, map_id = region),
           color = "black", fill = "lightgray", size = 0.1) +
  geom_tile(data = combined_data_01deg, aes(x = lon_std, y = lat_std, fill = predicted_fishing_hours)) +
  scale_fill_viridis(
    option = "inferno",
    direction = -1,
    trans = "log1p",
    name = "AIS Fishing Effort (hours; 2017-2020)",
    breaks = c(0, 1, 10,
100, 1000, 10000, 100000, 1000000),
    labels = scales::comma,
    guide = guide_colorbar(barwidth = 20, barheight = 0.5, title.position = "top", title.hjust = 0.5)
  ) +
  coord_fixed(1.3) +
  theme_minimal() +
  labs(
    title = "Global Fishing Hours (0.1 degree resolution)",
    subtitle = "Based on AIS data and Random Forest predictions from SAR data",
    x = "Longitude", y = "Latitude"
  ) +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "vertical",
    legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
    legend.title = element_text(margin = ggplot2::margin(b = 10))
  )

print(predicted_plot)

# Aggregate data to 0.5 degree resolution
# First, round the coordinates to the nearest 0.5 degree
combined_data_05deg <- combined_data_01deg %>%
  mutate(
    lon_05deg = round(lon_std * 2) / 2,
    lat_05deg = round(lat_std * 2) / 2
  )

# Aggregate the data
aggregated_data <- combined_data_05deg %>%
  group_by(lon_05deg, lat_05deg) %>%
  summarise(
    predicted_fishing_hours = sum(predicted_fishing_hours, na.rm = TRUE),
    .groups = 'drop'
  )

save(aggregated_data, file = here::here("Predicted_Fishing_Hours_05Deg.Rdata"))

# Create the map
predicted_plot_05deg <- ggplot() +
  geom_map(data = world_map, map = world_map,
           aes(long, lat, map_id = region),
           color = "black", fill = "lightgray", size = 0.1) +
  geom_tile(data = aggregated_data, aes(x = lon_05deg, y = lat_05deg, fill = predicted_fishing_hours)) +
  scale_fill_viridis(
    option = "inferno",
    direction = -1,
    trans = "log1p",
    name = "AIS Fishing Effort (hours; 2017-2020)",
    breaks = c(0, 1, 10, 100, 1000, 10000, 100000, 1000000),
    labels = scales::comma,
    guide = guide_colorbar(barwidth = 20, barheight = 0.5, title.position = "top", title.hjust = 0.5)
  ) +
  coord_fixed(1.3) +
  theme_minimal() +
  labs(
    title = "Global Fishing Hours (0.5 degree resolution)",
    subtitle = "Based on AIS data and Random Forest predictions from SAR data",
    x = "Longitude", y = "Latitude"
  ) +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "vertical",
    legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
    legend.title = element_text(margin =
ggplot2::margin(b = 10))
  )

# Print the map
print(predicted_plot_05deg)

# Caribbean map
caribbean_map <- create_region_map(aggregated_data, world_map, "lon_05deg", "lat_05deg",
                                   c(-100, -50), c(0, 40),
                                   "Fishing Hours in the Caribbean",
                                   "0.5 degree resolution")

# Northwestern Indian Ocean to Western European waters map
indian_european_map <- create_region_map(aggregated_data, world_map, "lon_05deg", "lat_05deg",
                                         c(-20, 80), c(0, 70),
                                         "Fishing Hours from Northern Indian Ocean \nto Eastern Atlantic",
                                         "0.5 degree resolution")

# Asia map
asia_map <- create_region_map(aggregated_data, world_map, "lon_05deg", "lat_05deg",
                              c(60, 180), c(-20, 60),
                              "Fishing Hours in Asia",
                              "0.5 degree resolution")

# Print the maps
print(caribbean_map)
print(indian_european_map)
print(asia_map)
```

### Model performance

```{r}
# Evaluate a fitted randomForest regression model against a data frame that
# contains the observed response `total_fishing_hours`.
# model:      a randomForest object (importance = TRUE assumed for the
#             feature-importance output).
# data:       data frame with the predictors and total_fishing_hours.
# log_target: if TRUE, predictions are back-transformed from log10(x + 1).
# Returns a named list of error metrics, R-squared values, residual summary
# statistics and the model's feature-importance matrix.
evaluate_model <- function(model, data, log_target = FALSE) {
  predictions <- predict(model, newdata = data)
  if (log_target) {
    predictions <- 10^predictions - 1
  }
  actual <- data$total_fishing_hours

  # Basic error metrics
  mae <- mean(abs(actual - predictions), na.rm = TRUE)
  rmse <- sqrt(mean((actual - predictions)^2, na.rm = TRUE))
  mape <- mean(abs((actual - predictions) / actual) * 100, na.rm = TRUE)
  medae <- median(abs(actual - predictions), na.rm = TRUE)

  # R-squared (matching randomForest's % Var explained)
  r_squared <- model$rsq[length(model$rsq)]

  # Adjusted R-squared
  n <- length(actual)
  # FIX: `model$forest$independent.variable.names` is a ranger field and is
  # NULL on randomForest objects, which made p = 0 and silently collapsed the
  # adjusted R-squared to the plain R-squared. randomForest stores one
  # importance row per predictor, so count predictors from there.
  p <- nrow(model$importance)
  adj_r_squared <- 1 - ((1 - r_squared) * (n - 1) / (n - p - 1))

  # Residual analysis
  residuals <- actual - predictions
  mean_residual <- mean(residuals, na.rm = TRUE)
  sd_residual <- sd(residuals, na.rm = TRUE)

  # Feature importance (for Random Forest)
  feature_importance <- importance(model)

  return(list(
    "Mean Absolute Error" = mae,
    "Root Mean Squared Error" = rmse,
    "Mean Absolute Percentage Error" = mape,
    "Median Absolute Error" = medae,
    "R-Squared" = r_squared,
    "Adjusted R-Squared" = adj_r_squared,
    "Mean of Residuals" = mean_residual,
    "Standard Deviation of Residuals" = sd_residual,
    "Feature Importance" = feature_importance
  ))
}

validation_data <- combined_data_with_rasters %>%
  mutate(
    data_category = case_when(
      has_AIS & has_SAR ~ "Both AIS and SAR",
      has_AIS & !has_SAR ~ "Only AIS",
      !has_AIS & has_SAR ~ "Only SAR",
      TRUE ~ "No fishing detected"
    )
  )

# Evaluate all models on cells observed by both sources
validation_data <- validation_data %>%
  filter(data_category == "Both AIS and SAR")

results_rf_env <- evaluate_model(rf_model_env, validation_data)

# Create a dataframe for the table
results_table <- data.frame(
  Metric = c("Mean Absolute Error", "Root Mean Squared Error", "Mean Absolute Percentage Error",
             "Median Absolute Error", "R-Squared", "Adjusted R-Squared",
             "Mean of Residuals", "Standard Deviation of Residuals"),
  Value = round(c(results_rf_env$`Mean Absolute Error`,
                  results_rf_env$`Root Mean Squared Error`,
                  results_rf_env$`Mean Absolute Percentage Error`,
                  results_rf_env$`Median Absolute Error`,
                  results_rf_env$`R-Squared`,
                  results_rf_env$`Adjusted R-Squared`,
                  results_rf_env$`Mean of Residuals`,
                  results_rf_env$`Standard Deviation of Residuals`), 2)
)

# Create and display the table
kable(results_table, format = "html", digits = 4, caption = "Model Evaluation Metrics") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE, position = "center")

# For feature importance, create a separate table
feature_importance <- as.data.frame(results_rf_env$`Feature Importance`)
feature_importance$Feature <- rownames(feature_importance)
feature_importance <- feature_importance[, c("Feature", "%IncMSE", "IncNodePurity")]
colnames(feature_importance) <- c("Feature", "%IncMSE", "IncNodePurity")

# Sort the feature importance table by %IncMSE in descending order
feature_importance <- feature_importance[order(-feature_importance$`%IncMSE`), ]

kable(feature_importance, format = "html", digits = 4,
      col.names = c("Feature", "%IncMSE", "IncNodePurity"),
      caption = "Feature Importance") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed",
"responsive"),
                full_width = FALSE, position = "center") %>%
  column_spec(1, bold = TRUE) %>%
  column_spec(2:3, width = "150px")
```

### Test for spatial auto-correlation

```{r}
## ------------ Create a non-random test set

# Prepare the data: cells observed by both sources, complete cases only
data <- combined_data_with_rasters %>%
  filter(has_AIS & has_SAR) %>%
  dplyr::select(total_fishing_hours, total_presence_score, lon_std, lat_std, dist_shore, dist_ports, bathy) %>%
  na.omit()

# Split the data into training and test sets
set.seed(123) # for reproducibility
# FIX: use seq_len() instead of 1:nrow() (safe if the data were empty) and
# make the 70% sample size an explicit integer with floor().
train_indices <- sample(seq_len(nrow(data)), floor(0.7 * nrow(data)))
train_data <- data[train_indices, ]
test_data <- data[-train_indices, ]

# Train the random forest model (expensive; kept commented, fitted model is
# loaded from disk below)
# set.seed(123) # for reproducibility
# rf_timing <- system.time({
#   rf_model_env <- randomForest(
#     total_fishing_hours ~ total_presence_score + lon_std + lat_std + dist_shore + dist_ports + bathy,
#     data = train_data,
#     ntree = 500,
#     importance = TRUE
#   )
# })
# cat("Random Forest model training time:", rf_timing["elapsed"], "seconds\n")

# Save the new model
# rf_model_env_train = rf_model_env
# save(rf_model_env_train, file = "rf_model_01deg_with_rasters_train.Rdata")
load(here::here("rf_model_01deg_with_rasters_train.Rdata"))

# Make predictions on the test set
test_data$predictions <- predict(rf_model_env_train, test_data)

# Calculate residuals
test_data$residuals <- test_data$total_fishing_hours - test_data$predictions

# Prepare spatial data for autocorrelation analysis
test_data_sf <- st_as_sf(test_data, coords = c("lon_std", "lat_std"), crs = 4326)

# Create a neighbors list (k nearest neighbours)
k <- 8 # You can adjust this number
nb <- knearneigh(st_coordinates(test_data_sf), k = k)
nb <- knn2nb(nb)

# Create a spatial weights matrix (row-standardised)
lw <- nb2listw(nb, style = "W")

# Perform Moran's I test on the test set residuals
moran_test <- moran.test(test_data_sf$residuals, lw)

# Create a data frame with the test results
moran_results <- data.frame(
  Statistic = c("Moran I statistic", "Expectation", "Variance", "Standard deviate", "P-value"),
  Value
= c(
    round(moran_test$estimate[1], 2),
    round(moran_test$estimate[2], 2),
    round(moran_test$estimate[3], 2),
    round(moran_test$statistic, 2),
    ifelse(moran_test$p.value < 2.2e-16, "< 2.2e-16",
           format(moran_test$p.value, scientific = TRUE, digits = 4))
  )
)

# Create and display the table
kable(moran_results, format = "html",
      col.names = c("Statistic", "Value"),
      caption = "Moran I Test under randomisation") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE, position = "center") %>%
  column_spec(1, bold = TRUE) %>%
  column_spec(2, width = "15em")
```

### Spatial cross-validation

```{r}
# Assuming your training_data has lon_std (longitude) and lat_std (latitude) columns
coords <- SpatialPoints(training_data[, c("lon_std", "lat_std")])

# Combine your training data with spatial coordinates
spatial_data <- SpatialPointsDataFrame(coords, data = training_data)

# Testing to split the coordinates with k-means clustering
# Ensure spatial_data is an sf object
if (!inherits(spatial_data, "sf")) {
  spatial_data_sf <- st_as_sf(spatial_data)
} else {
  spatial_data_sf <- spatial_data
}

# Extract coordinates
coords <- st_coordinates(spatial_data_sf)

# Perform k-means clustering into 6 spatial blocks
set.seed(123) # for reproducibility
kmeans_result <- kmeans(coords, centers = 6)

# Add cluster assignments to the sf object
spatial_data_sf$block <- as.factor(kmeans_result$cluster)

# Create the plot
ggplot() +
  geom_sf(data = spatial_data_sf, aes(color = block), size = 1) +
  scale_color_viridis_d(name = "Block") +
  theme_minimal() +
  labs(
    title = "Spatial Blocks for 6-Fold Cross-Validation",
    subtitle = "Points colored by block assignment"
  ) +
  theme(legend.position = "bottom")

# Register cores for parallel processing (use 1 less than your total cores)
# cl <- makeCluster(detectCores() - 1)
# registerDoParallel(cl)

# Initialize a list to store results
# results <- foreach(i = 1:6, .packages = c("randomForest", "dplyr"), .export = "spatial_data_sf") %dopar% {
#   Get the indices of the training and test sets for the i-th block
#   test_indices <-
# spatial_data_sf$block == i
#   train_data <- spatial_data_sf[!test_indices, ]
#   test_data <- spatial_data_sf[test_indices, ]
#   Train Random Forest on the training data
#   rf_model <- randomForest(
#     total_fishing_hours ~ total_presence_score + lon_std + lat_std + dist_shore + dist_ports + bathy,
#     data = train_data,
#     ntree = 500,
#     importance = TRUE
#   )
#   Save the model
#   model_filename <- paste0("rf_model_fold_", i, ".rds")
#   saveRDS(rf_model, file = model_filename)
#   Predict on the test data
#   predictions <- predict(rf_model, test_data)
#   Return a list with results and model filename
#   list(
#     results = data.frame(observed = test_data$total_fishing_hours, predicted = predictions, fold = i),
#     model_filename = model_filename
#   )
# }

# Stop the cluster once done
# stopCluster(cl)

# Extract results and model filenames
# results_df <- do.call(rbind, lapply(results, function(x) x$results))
# model_filenames <- sapply(results, function(x) x$model_filename)

# Save results
# save(results_df, file = "spatial_cv_results.RData")
# save(model_filenames, file = "model_filenames.RData")

# load(here::here("spatial_cv_results.RData"))
# load(here::here("model_filenames.RData"))

# Ensure spatial_data and results_df are aligned
# spatial_data <- spatial_data[order(row.names(spatial_data)), ]
# results_df <- results_df[order(row.names(results_df)), ]

# Get all coordinates
# all_coordinates <- coordinates(spatial_data)

# Create neighbor list for the entire dataset
# k <- 30 # Number of nearest neighbors, adjust as needed
# nb_all <- knn2nb(knearneigh(all_coordinates, k = k))

# Create spatial weights for the entire dataset
# lw_all <- nb2listw(nb_all, style="W", zero.policy = TRUE)

# Function to calculate Moran's I
# calculate_morans_i <- function(residuals, indices, lw_all) {
#   Create a logical vector for subsetting
#   subset_vector <- rep(FALSE, length(lw_all$neighbours))
#   subset_vector[indices] <- TRUE
#   Subset the weights list for the current fold
#   lw_subset <- subset.listw(lw_all, subset_vector, zero.policy = TRUE)
#   Calculate Moran's I
#
# moran_result <- moran.test(residuals, lw_subset, zero.policy = TRUE)
#   return(moran_result)
# }

# Calculate Moran's I for each fold in the spatial cross-validation
# morans_i_results <- list()
# for (i in 1:6) {
#   Get the indices for this fold
#   fold_indices <- which(results_df$fold == i)
#   Calculate residuals
#   fold_residuals <- results_df$observed[fold_indices] - results_df$predicted[fold_indices]
#   Calculate Moran's I for this fold
#   morans_i_results[[i]] <- calculate_morans_i(fold_residuals, fold_indices, lw_all)
#   Print progress
#   print(paste("Completed fold", i))
# }
# save(morans_i_results, file="morans_i_results.Rdata")

load(here::here("morans_i_results.Rdata"))

# Print results
# NOTE(review): this commented-out loop only prints folds 1:5 although there
# are 6 folds — change to 1:6 if it is ever re-enabled.
# print("Moran's I for each fold in spatial cross-validation:")
# for (i in 1:5) {
#   print(paste("Fold", i))
#   print(morans_i_results[[i]])
# }

# Extract relevant information from Moran's I results
morans_table <- data.frame(
  Fold = 1:6,
  Moran_I = sapply(morans_i_results, function(x) x$estimate[1]),
  Expectation = sapply(morans_i_results, function(x) x$estimate[2]),
  Variance = sapply(morans_i_results, function(x) x$estimate[3]),
  Std_Deviate = sapply(morans_i_results, function(x) x$statistic),
  P_Value = sapply(morans_i_results, function(x) x$p.value)
)

# Format the table
morans_table_formatted <- morans_table %>%
  mutate(
    Moran_I = round(Moran_I, 4),
    Expectation = format(Expectation, scientific = TRUE, digits = 4),
    Variance = format(Variance, scientific = TRUE, digits = 4),
    Std_Deviate = round(Std_Deviate, 2),
    P_Value = ifelse(P_Value < 2.2e-16, "< 2.2e-16",
                     format(P_Value, scientific = TRUE, digits = 4))
  )

# Create and print the table using kable and kable_styling
kable(morans_table_formatted,
      col.names = c("Fold", "Moran's I", "Expectation", "Variance", "Std.
Deviate", "p-value"),
      caption = "Moran's I Results for Each Fold in Spatial Cross-Validation",
      align = c('c', 'r', 'r', 'r', 'r', 'r'),
      format = "html") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE, position = "center") %>%
  column_spec(1, bold = TRUE) %>%
  column_spec(2:6, width = "150px")
```

### Spatially blocked predictions

```{r}
# Load necessary libraries
library(raster)
library(ggplot2)
library(viridis)
library(dplyr)
library(data.table)

# Load the model filenames
load(here::here("model_filenames.RData"))

# Ensemble-average predictions from the per-fold spatially blocked models.
# GENERALIZED: the number of models is derived from model_filenames instead
# of the hard-coded constant 6, so the fold count can change without
# touching this function; behavior is identical for the current 6 folds.
make_predictions <- function(data) {
  n_models <- length(model_filenames)
  predictions <- matrix(0, nrow = nrow(data), ncol = n_models)
  for (i in seq_len(n_models)) {
    model <- readRDS(here::here(model_filenames[i]))
    predictions[, i] <- predict(model, data)
  }
  # Take the average prediction across all models
  rowMeans(predictions)
}

# Prepare the prediction data (same as before)
prediction_data <- combined_data_with_rasters %>%
  dplyr::select(total_presence_score, lon_std, lat_std, dist_shore, dist_ports, bathy)

# Make predictions using the spatially blocked models
new_predictions <- make_predictions(prediction_data)

# Add predictions to the original dataset
combined_data_01deg <- combined_data_01deg %>%
  mutate(
    predicted_fishing_hours = case_when(
      has_AIS ~ total_fishing_hours,
      has_SAR ~ new_predictions[match(paste(lon_std, lat_std), paste(prediction_data$lon_std, prediction_data$lat_std))],
      TRUE ~ 0
    )
  )

# Now you can use the same mapping code as before, but it will use the new predictions
# Map of predicted fishing hours only
map_data <- combined_data_01deg %>%
  filter(!has_AIS & has_SAR) %>%
  dplyr::select(lon_std, lat_std, predicted_fishing_hours)

# Use your create_region_map function to create the maps
# Global map
predicted_SAR_only_plot <- create_region_map(map_data, world_map, "lon_std", "lat_std",
                                             c(-180, 180), c(-90, 90),
                                             "Predicted Fishing Hours in Areas with Only SAR Detections",
                                             "0.1 degree resolution (Spatially Blocked Model)")

# Caribbean map
caribbean_map <- create_region_map(map_data, world_map, "lon_std", "lat_std",
                                   c(-100, -50), c(0, 40),
                                   "Predicted Fishing Hours in the Caribbean",
                                   "0.1 degree resolution (Spatially Blocked Model)")

# Northwestern Indian Ocean to Western European waters map
indian_european_map <- create_region_map(map_data, world_map, "lon_std", "lat_std",
                                         c(-20, 80), c(0, 70),
                                         "Predicted Fishing Hours from Northern Indian Ocean \nto Eastern Atlantic",
                                         "0.1 degree resolution (Spatially Blocked Model)")

# Asia map
asia_map <- create_region_map(map_data, world_map, "lon_std", "lat_std",
                              c(60, 180), c(-20, 60),
                              "Predicted Fishing Hours in Asia",
                              "0.1 degree resolution (Spatially Blocked Model)")

# Print the maps
print(predicted_SAR_only_plot)
print(caribbean_map)
print(indian_european_map)
print(asia_map)

# Create the global map of both original and predicted fishing hours
predicted_plot <- ggplot() +
  geom_map(data = world_map, map = world_map,
           aes(long, lat, map_id = region),
           color = "black", fill = "lightgray", size = 0.1) +
  geom_tile(data = combined_data_01deg, aes(x = lon_std, y = lat_std, fill = predicted_fishing_hours)) +
  scale_fill_viridis(
    option = "inferno",
    direction = -1,
    trans = "log1p",
    name = "AIS Fishing Effort (hours; 2017-2020)",
    breaks = c(0, 1, 10, 100, 1000, 10000, 100000, 1000000),
    labels = scales::comma,
    guide = guide_colorbar(barwidth = 20, barheight = 0.5, title.position = "top", title.hjust = 0.5)
  ) +
  coord_fixed(1.3) +
  theme_minimal() +
  labs(
    title = "Global Fishing Hours (0.1 degree resolution)",
    subtitle = "Based on AIS data and Spatially Blocked Random Forest predictions from SAR data",
    x = "Longitude", y = "Latitude"
  ) +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "vertical",
    legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
    legend.title = element_text(margin = ggplot2::margin(b = 10))
  )

print(predicted_plot)

# Aggregate data to 0.5 degree resolution
combined_data_05deg <- combined_data_01deg %>%
  mutate(
    lon_05deg = round(lon_std * 2) / 2,
    lat_05deg = round(lat_std * 2) / 2
  ) %>%
  group_by(lon_05deg,
lat_05deg) %>%
  summarize(predicted_fishing_hours = sum(predicted_fishing_hours, na.rm = TRUE)) %>%
  ungroup()

# Global map (0.5 degree)
global_map_05deg <- create_region_map(combined_data_05deg, world_map, "lon_05deg", "lat_05deg",
                                      c(-180, 180), c(-90, 90),
                                      "Global Fishing Hours",
                                      "0.5 degree resolution (Observed and Predicted)")

# Caribbean map (0.5 degree)
caribbean_map_05deg <- create_region_map(combined_data_05deg, world_map, "lon_05deg", "lat_05deg",
                                         c(-100, -50), c(0, 40),
                                         "Fishing Hours in the Caribbean",
                                         "0.5 degree resolution (Observed and Predicted)")

# Northwestern Indian Ocean to Western European waters map (0.5 degree)
indian_european_map_05deg <- create_region_map(combined_data_05deg, world_map, "lon_05deg", "lat_05deg",
                                               c(-20, 80), c(0, 70),
                                               "Fishing Hours from Northern Indian Ocean \nto Eastern Atlantic",
                                               "0.5 degree resolution (Observed and Predicted)")

# Asia map (0.5 degree)
asia_map_05deg <- create_region_map(combined_data_05deg, world_map, "lon_05deg", "lat_05deg",
                                    c(60, 180), c(-20, 60),
                                    "Fishing Hours in Asia",
                                    "0.5 degree resolution (Observed and Predicted)")

# Print the maps
print(global_map_05deg)
print(caribbean_map_05deg)
print(indian_european_map_05deg)
print(asia_map_05deg)
```